This file is designed to use CDC data to assess coronavirus disease burden by state, including creating and analyzing state-level cluters.
Through March 7, 2021, The COVID Tracking Project collected and integrated data on tests, cases, hospitalizations, deaths, and the like by state and date. The latest code for using this data is available in Coronavirus_Statistics_CTP_v004.Rmd.
The COVID Tracking Project suggest that US federal data sources are now sufficiently robust to be used for analyses that previously relied on COVID Tracking Project. This code is an attempt to update modules in Coronavirus_Statistics_CTP_v004.Rmd to leverage US federal data.
The code in this module builds on code available in _v001, and splits many functions in to two main .R files that can be sourced:
Broadly, the CDC data analyzed by this module includes:
The tidyverse package is loaded and functions are sourced:
# The tidyverse functions are routinely used without package::function format
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.1 v dplyr 1.0.6
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Functions are available in source file
source("./Generic_Added_Utility_Functions_202105_v001.R")
source("./Coronavirus_CDC_Daily_Functions_v001.R")
A series of mapping files are also available to allow for parameterized processing. Mappings include:
These default parameters are maintained in a separate .R file and can be sourced:
source("./Coronavirus_CDC_Daily_Default_Mappings_v002.R")
Additionally, a mapping file could be maintained to give default plotting labels to variables. This is currently not used by any of the sourced functions:
# Create a variable mapping file - this is currently redundant
varMapper <- c()
Code from the previous model is run, with results compared to previous results:
readList <- list("cdcDaily"="./RInputFiles/Coronavirus/CDC_dc_downloaded_210502.csv",
"cdcHosp"="./RInputFiles/Coronavirus/CDC_h_downloaded_210509.csv"
)
cdc_daily_compare <- readRunCDCDaily(thruLabel="May 2, 2021",
readFrom=readList,
compareFile=list("cdcDaily"=colRenamer(readFromRDS("dfRaw_dc_210414"),
c('new_case'='new_cases',
'tot_death'='tot_deaths',
'new_death'='new_deaths'
)
),
"cdcHosp"=readFromRDS("dfHosp_old")
),
writeLog="./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_v002.log",
ovrwriteLog=TRUE,
dfPerCapita=NULL,
useClusters=readFromRDS("cdc_daily_test_v2")$useClusters,
skipAssessmentPlots=FALSE,
brewPalette="Paired"
)
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/CDC_dc_downloaded_210502.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## submission_date = col_character(),
## state = col_character(),
## tot_cases = col_double(),
## conf_cases = col_double(),
## prob_cases = col_double(),
## new_case = col_double(),
## pnew_case = col_double(),
## tot_death = col_double(),
## conf_death = col_double(),
## prob_death = col_double(),
## new_death = col_double(),
## pnew_death = col_double(),
## created_at = col_character(),
## consent_cases = col_character(),
## consent_deaths = col_character()
## )
##
## *** File has been checked for uniqueness by: state date
##
##
## Checking for similarity of: column names
## In reference but not in current: naconf
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 18
## Detailed differences available in: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_v002.log
##
## Checking for similarity of: state
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 97 records
## Detailed output available in log: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_v002.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 14 records
## Detailed output available in log: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_v002.log
##
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/CDC_h_downloaded_210509.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## state = col_character(),
## date = col_date(format = ""),
## geocoded_state = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: state date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference: previous_day_admission_adult_covid_confirmed_18-19 previous_day_admission_adult_covid_confirmed_18-19_coverage previous_day_admission_adult_covid_confirmed_20-29 previous_day_admission_adult_covid_confirmed_20-29_coverage previous_day_admission_adult_covid_confirmed_30-39 previous_day_admission_adult_covid_confirmed_30-39_coverage previous_day_admission_adult_covid_confirmed_40-49 previous_day_admission_adult_covid_confirmed_40-49_coverage previous_day_admission_adult_covid_confirmed_50-59 previous_day_admission_adult_covid_confirmed_50-59_coverage previous_day_admission_adult_covid_confirmed_60-69 previous_day_admission_adult_covid_confirmed_60-69_coverage previous_day_admission_adult_covid_confirmed_70-79 previous_day_admission_adult_covid_confirmed_70-79_coverage previous_day_admission_adult_covid_confirmed_80+ previous_day_admission_adult_covid_confirmed_80+_coverage previous_day_admission_adult_covid_confirmed_unknown previous_day_admission_adult_covid_confirmed_unknown_coverage previous_day_admission_adult_covid_suspected_18-19 previous_day_admission_adult_covid_suspected_18-19_coverage previous_day_admission_adult_covid_suspected_20-29 previous_day_admission_adult_covid_suspected_20-29_coverage previous_day_admission_adult_covid_suspected_30-39 previous_day_admission_adult_covid_suspected_30-39_coverage previous_day_admission_adult_covid_suspected_40-49 previous_day_admission_adult_covid_suspected_40-49_coverage previous_day_admission_adult_covid_suspected_50-59 previous_day_admission_adult_covid_suspected_50-59_coverage previous_day_admission_adult_covid_suspected_60-69 previous_day_admission_adult_covid_suspected_60-69_coverage previous_day_admission_adult_covid_suspected_70-79 previous_day_admission_adult_covid_suspected_70-79_coverage previous_day_admission_adult_covid_suspected_80+ previous_day_admission_adult_covid_suspected_80+_coverage previous_day_admission_adult_covid_suspected_unknown previous_day_admission_adult_covid_suspected_unknown_coverage
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 15
## Detailed differences available in: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_v002.log
##
## Checking for similarity of: state
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 6 records
## Detailed output available in log: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_v002.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 63 records
## Detailed output available in log: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_v002.log
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 6
## isType tot_cases tot_deaths new_cases new_deaths n
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 before 5.08e+9 1.07e+8 3.21e+7 558830 27435
## 2 after 5.06e+9 1.06e+8 3.19e+7 556355 23715
## 3 pctchg 4.40e-3 3.81e-3 4.47e-3 0.00443 0.136
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 5
## isType inp hosp_adult hosp_ped n
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 before 2.57e+7 1.99e+7 436353 23230
## 2 after 2.56e+7 1.98e+7 426239 22395
## 3 pctchg 5.60e-3 5.66e-3 0.0232 0.0359
## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO", prefer_proj =
## prefer_proj): Discarded datum unknown in CRS definition
identical(cdc_daily_compare[c("stateData", "dfRaw", "dfProcess", "dfPerCapita", "useClusters")],
readFromRDS("cdc_daily_test_v3")[c("stateData", "dfRaw", "dfProcess", "dfPerCapita", "useClusters")]
)
## [1] TRUE
identical(cdc_daily_compare$plotDataList[c("dfFull", "dfAgg", "plotClusters")],
readFromRDS("cdc_daily_test_v3")$plotDataList[c("dfFull", "dfAgg", "plotClusters")]
)
## [1] TRUE
The core data elements are identical, and the plots appear to convey the same information. Next steps are to download the latest data and process with existing clusters.
Updated data are downloaded and processed, using existing segments. The downloadTo argument is edited using lapply to avoid downloading data if it has previously been downloaded:
readList <- list("cdcDaily"="./RInputFiles/Coronavirus/CDC_dc_downloaded_210528.csv",
"cdcHosp"="./RInputFiles/Coronavirus/CDC_h_downloaded_210528.csv"
)
compareList <- list("cdcDaily"=readFromRDS("cdc_daily_test_v3")$dfRaw$cdcDaily,
"cdcHosp"=readFromRDS("cdc_daily_test_v3")$dfRaw$cdcHosp
)
cdc_daily_210528 <- readRunCDCDaily(thruLabel="May 28, 2021",
downloadTo=lapply(readList, FUN=function(x) if(file.exists(x)) NA else x),
readFrom=readList,
compareFile=compareList,
writeLog="./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_210528.log",
useClusters=readFromRDS("cdc_daily_test_v2")$useClusters,
skipAssessmentPlots=FALSE,
brewPalette="Paired"
)
##
## -- Column specification --------------------------------------------------------
## cols(
## submission_date = col_character(),
## state = col_character(),
## tot_cases = col_double(),
## conf_cases = col_double(),
## prob_cases = col_double(),
## new_case = col_double(),
## pnew_case = col_double(),
## tot_death = col_double(),
## conf_death = col_double(),
## prob_death = col_double(),
## new_death = col_double(),
## pnew_death = col_double(),
## created_at = col_character(),
## consent_cases = col_character(),
## consent_deaths = col_character()
## )
##
## *** File has been checked for uniqueness by: state date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 26
## Detailed differences available in: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_210528.log
##
## Checking for similarity of: state
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 593 records
## Detailed output available in log: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_210528.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 39 records
## Detailed output available in log: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_210528.log
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## state = col_character(),
## date = col_date(format = ""),
## geocoded_state = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: state date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 14
## Detailed differences available in: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_210528.log
##
## Checking for similarity of: state
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 3 records
## Detailed output available in log: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_210528.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 49 records
## Detailed output available in log: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_210528.log
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 6
## isType tot_cases tot_deaths new_cases new_deaths n
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 before 5.99e+9 1.24e+8 3.29e+7 577667 28969
## 2 after 5.96e+9 1.23e+8 3.28e+7 575010 25041
## 3 pctchg 4.37e-3 3.82e-3 4.55e-3 0.00460 0.136
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 5
## isType inp hosp_adult hosp_ped n
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 before 2.61e+7 2.03e+7 415621 23972
## 2 after 2.60e+7 2.02e+7 405188 23109
## 3 pctchg 5.67e-3 5.73e-3 0.0251 0.0360
## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO", prefer_proj =
## prefer_proj): Discarded datum unknown in CRS definition
saveToRDS(cdc_daily_210528, ovrWrite=FALSE, ovrWriteError=FALSE)
The process appears to work as intended. Next steps are to update the county-level data process, making use of some of the functions available for CDC data processing.
The latest version of the data are downloaded and processed:
readList <- list("cdcDaily"="./RInputFiles/Coronavirus/CDC_dc_downloaded_210708.csv",
"cdcHosp"="./RInputFiles/Coronavirus/CDC_h_downloaded_210708.csv"
)
compareList <- list("cdcDaily"=readFromRDS("cdc_daily_210528")$dfRaw$cdcDaily,
"cdcHosp"=readFromRDS("cdc_daily_210528")$dfRaw$cdcHosp
)
cdc_daily_210708 <- readRunCDCDaily(thruLabel="Jul 08, 2021",
downloadTo=lapply(readList, FUN=function(x) if(file.exists(x)) NA else x),
readFrom=readList,
compareFile=compareList,
writeLog="./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_210708.log",
useClusters=readFromRDS("cdc_daily_210528")$useClusters,
skipAssessmentPlots=FALSE,
brewPalette="Paired"
)
##
## -- Column specification --------------------------------------------------------
## cols(
## submission_date = col_character(),
## state = col_character(),
## tot_cases = col_double(),
## conf_cases = col_double(),
## prob_cases = col_double(),
## new_case = col_double(),
## pnew_case = col_double(),
## tot_death = col_double(),
## conf_death = col_double(),
## prob_death = col_double(),
## new_death = col_double(),
## pnew_death = col_double(),
## created_at = col_character(),
## consent_cases = col_character(),
## consent_deaths = col_character()
## )
##
## *** File has been checked for uniqueness by: state date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 40
## Detailed differences available in: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_210708.log
##
## Checking for similarity of: state
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 432 records
## Detailed output available in log: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_210708.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 43 records
## Detailed output available in log: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_210708.log
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## state = col_character(),
## date = col_date(format = ""),
## geocoded_state = col_logical()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: state date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference: deaths_covid deaths_covid_coverage
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 42
## Detailed differences available in: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_210708.log
##
## Checking for similarity of: state
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 3 records
## Detailed output available in log: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_210708.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 57 records
## Detailed output available in log: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_210708.log
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 6
## isType tot_cases tot_deaths new_cases new_deaths n
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 before 7.32e+9 1.49e+8 3.35e+7 596979 31329
## 2 after 7.29e+9 1.48e+8 3.33e+7 594255 27081
## 3 pctchg 4.40e-3 3.91e-3 4.57e-3 0.00456 0.136
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 5
## isType inp hosp_adult hosp_ped n
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 before 2.70e+7 2.11e+7 447142 26198
## 2 after 2.69e+7 2.10e+7 435737 25251
## 3 pctchg 5.65e-3 5.67e-3 0.0255 0.0361
## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO", prefer_proj =
## prefer_proj): Discarded datum unknown in CRS definition
saveToRDS(cdc_daily_210708, ovrWrite=FALSE, ovrWriteError=FALSE)
Vaccines data are also available for download on the CDC website:
urlVaccine <- "https://data.cdc.gov/api/views/unsk-b7fc/rows.csv?accessType=DOWNLOAD"
locVaccine <- "./RInputFiles/Coronavirus/CDC_vax_downloaded_210712.csv"
fileDownload(locVaccine, urlVaccine)
## size isdir mode
## ./RInputFiles/Coronavirus/CDC_vax_downloaded_210712.csv 4270315 FALSE 666
## mtime
## ./RInputFiles/Coronavirus/CDC_vax_downloaded_210712.csv 2021-07-12 09:01:36
## ctime
## ./RInputFiles/Coronavirus/CDC_vax_downloaded_210712.csv 2021-07-12 09:01:11
## atime exe
## ./RInputFiles/Coronavirus/CDC_vax_downloaded_210712.csv 2021-07-12 09:01:36 no
The file has many fields, including:
An individual can live in one state but be vaccinated in another state. Per the CDC field descriptions:
Fully vaccinated (series complete) metrics is defined as “Total number of people who are fully vaccinated (have second dose of a two-dose vaccine or one dose of a single-dose vaccine) based on the jurisdiction where recipient lives”
vaxRaw_210712 <- fileRead(locVaccine)
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## Date = col_character(),
## Location = col_character()
## )
## i Use `spec()` for the full column specifications.
glimpse(vaxRaw_210712)
## Rows: 13,618
## Columns: 69
## $ Date <chr> "07/11/2021", "07/11/2021", "07~
## $ MMWR_week <dbl> 28, 28, 28, 28, 28, 28, 28, 28,~
## $ Location <chr> "FL", "IA", "WI", "MO", "ND", "~
## $ Distributed <dbl> 25229075, 3506895, 6207245, 620~
## $ Distributed_Janssen <dbl> 1694500, 188700, 318700, 311400~
## $ Distributed_Moderna <dbl> 10217260, 1460040, 2633920, 254~
## $ Distributed_Pfizer <dbl> 13317315, 1858155, 3254625, 334~
## $ Distributed_Unk_Manuf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ Dist_Per_100K <dbl> 117466, 111151, 106609, 101065,~
## $ Distributed_Per_100k_12Plus <dbl> 134944, 131036, 124150, 118471,~
## $ Distributed_Per_100k_18Plus <dbl> 146274, 144422, 136248, 130124,~
## $ Distributed_Per_100k_65Plus <dbl> 560978, 634211, 610203, 584049,~
## $ Administered <dbl> 21527263, 3073527, 6017859, 520~
## $ Administered_12Plus <dbl> 21519017, 3073495, 6017495, 520~
## $ Administered_18Plus <dbl> 20764735, 2932330, 5732822, 499~
## $ Administered_65Plus <dbl> 7906498, 959982, 1837143, 16454~
## $ Administered_Janssen <dbl> 1048774, 128869, 240681, 174610~
## $ Administered_Moderna <dbl> 8579143, 1297742, 2470502, 1977~
## $ Administered_Pfizer <dbl> 11846137, 1646788, 3306004, 305~
## $ Administered_Unk_Manuf <dbl> 53209, 128, 672, 466, 0, 2580, ~
## $ Administered_Fed_LTC <dbl> 405647, 138684, 182382, 158723,~
## $ Administered_Fed_LTC_Residents <dbl> 209000, 62049, 85961, 85652, 30~
## $ Administered_Fed_LTC_Staff <dbl> 119292, 45853, 59621, 49923, 22~
## $ Administered_Fed_LTC_Unk <dbl> 77355, 30782, 36800, 23148, 137~
## $ Administered_Fed_LTC_Dose1 <dbl> 230126, 87469, 115893, 93047, 3~
## $ Administered_Fed_LTC_Dose1_Residents <dbl> 117587, 35533, 50724, 48321, 15~
## $ Administered_Fed_LTC_Dose1_Staff <dbl> 67708, 28547, 36168, 29112, 117~
## $ Administered_Fed_LTC_Dose1_Unk <dbl> 44831, 23389, 29001, 15614, 811~
## $ Admin_Per_100K <dbl> 100231, 97415, 103356, 84885, 8~
## $ Admin_Per_100k_12Plus <dbl> 115099, 114842, 120355, 99498, ~
## $ Admin_Per_100k_18Plus <dbl> 120391, 120760, 125835, 104872,~
## $ Admin_Per_100k_65Plus <dbl> 175804, 173610, 180600, 154933,~
## $ Recip_Administered <dbl> 21237913, 3069562, 5974955, 511~
## $ Administered_Dose1_Recip <dbl> 11763654, 1638173, 3163125, 281~
## $ Administered_Dose1_Pop_Pct <dbl> 54.8, 51.9, 54.3, 45.9, 44.4, 5~
## $ Administered_Dose1_Recip_12Plus <dbl> 11756137, 1638108, 3162679, 281~
## $ Administered_Dose1_Recip_12PlusPop_Pct <dbl> 62.9, 61.2, 63.3, 53.8, 53.0, 6~
## $ Administered_Dose1_Recip_18Plus <dbl> 11323495, 1562036, 3007052, 269~
## $ Administered_Dose1_Recip_18PlusPop_Pct <dbl> 65.7, 64.3, 66.0, 56.6, 56.0, 7~
## $ Administered_Dose1_Recip_65Plus <dbl> 4061097, 490657, 920145, 859645~
## $ Administered_Dose1_Recip_65PlusPop_Pct <dbl> 90.3, 88.7, 90.5, 80.9, 83.5, 8~
## $ Series_Complete_Yes <dbl> 10086805, 1537214, 2951037, 243~
## $ Series_Complete_Pop_Pct <dbl> 47.0, 48.7, 50.7, 39.7, 39.4, 5~
## $ Series_Complete_12Plus <dbl> 10085351, 1537191, 2950892, 243~
## $ Series_Complete_12PlusPop_Pct <dbl> 53.9, 57.4, 59.0, 46.6, 47.0, 6~
## $ Series_Complete_18Plus <dbl> 9776152, 1473385, 2825253, 2353~
## $ Series_Complete_18PlusPop_Pct <dbl> 56.7, 60.7, 62.0, 49.4, 50.0, 6~
## $ Series_Complete_65Plus <dbl> 3551211, 475114, 889344, 779851~
## $ Series_Complete_65PlusPop_Pct <dbl> 79.0, 85.9, 87.4, 73.4, 74.5, 8~
## $ Series_Complete_Janssen <dbl> 1031811, 126334, 232849, 175144~
## $ Series_Complete_Moderna <dbl> 3807918, 629990, 1161367, 90416~
## $ Series_Complete_Pfizer <dbl> 5229909, 780797, 1556520, 13597~
## $ Series_Complete_Unk_Manuf <dbl> 17167, 93, 301, 101, 1, 792, 75~
## $ Series_Complete_Janssen_12Plus <dbl> 1031093, 126332, 232832, 175129~
## $ Series_Complete_Moderna_12Plus <dbl> 3807322, 629983, 1161353, 90415~
## $ Series_Complete_Pfizer_12Plus <dbl> 5229769, 780783, 1556406, 13597~
## $ Series_Complete_Unk_Manuf_12Plus <dbl> 17167, 93, 301, 101, 1, 792, 74~
## $ Series_Complete_Janssen_18Plus <dbl> 1030595, 126273, 232707, 174990~
## $ Series_Complete_Moderna_18Plus <dbl> 3806853, 629858, 1161109, 90392~
## $ Series_Complete_Pfizer_18Plus <dbl> 4921576, 717161, 1431144, 12746~
## $ Series_Complete_Unk_Manuf_18Plus <dbl> 17128, 93, 293, 93, 1, 781, 742~
## $ Series_Complete_Janssen_65Plus <dbl> 179075, 11728, 24812, 33179, 35~
## $ Series_Complete_Moderna_65Plus <dbl> 1755611, 252070, 432381, 357176~
## $ Series_Complete_Pfizer_65Plus <dbl> 1604988, 211256, 432022, 389451~
## $ Series_Complete_Unk_Manuf_65Plus <dbl> 11537, 60, 129, 45, 0, 464, 326~
## $ Series_Complete_FedLTC <dbl> 174063, 50507, 65859, 65388, 30~
## $ Series_Complete_FedLTC_Residents <dbl> 89676, 26063, 34733, 36971, 141~
## $ Series_Complete_FedLTC_Staff <dbl> 50661, 16950, 23251, 20660, 105~
## $ Series_Complete_FedLTC_Unknown <dbl> 33726, 7494, 7875, 7757, 552, 9~
vaxRenamer <- c("Location"="state",
"Date"="date",
"Admin_Per_100K"="Admin_Per_100k"
)
vaxKeeper <- c("state", "date", "MMWR_week",
"Administered", "Administered_12Plus", "Administered_18Plus", "Administered_65Plus",
"Admin_Per_100k", "Admin_Per_100k_12Plus", "Admin_Per_100k_18Plus", "Admin_Per_100k_65Plus",
"Recip_Administered",
"Series_Complete_Yes",
"Series_Complete_12Plus", "Series_Complete_18Plus", "Series_Complete_65Plus",
"Series_Complete_Pop_Pct",
"Series_Complete_12PlusPop_Pct", "Series_Complete_18PlusPop_Pct", "Series_Complete_65PlusPop_Pct"
)
vaxProcessed_210712 <- vaxRaw_210712 %>%
colRenamer(vecRename=vaxRenamer) %>%
colSelector(vecSelect=vaxKeeper) %>%
colMutater(selfList=list("date"=lubridate::mdy))
glimpse(vaxProcessed_210712)
## Rows: 13,618
## Columns: 20
## $ state <chr> "FL", "IA", "WI", "MO", "ND", "VA", "US"~
## $ date <date> 2021-07-11, 2021-07-11, 2021-07-11, 202~
## $ MMWR_week <dbl> 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, ~
## $ Administered <dbl> 21527263, 3073527, 6017859, 5209747, 649~
## $ Administered_12Plus <dbl> 21519017, 3073495, 6017495, 5209418, 648~
## $ Administered_18Plus <dbl> 20764735, 2932330, 5732822, 4999072, 626~
## $ Administered_65Plus <dbl> 7906498, 959982, 1837143, 1645444, 19396~
## $ Admin_Per_100k <dbl> 100231, 97415, 103356, 84885, 85282, 110~
## $ Admin_Per_100k_12Plus <dbl> 115099, 114842, 120355, 99498, 101825, 1~
## $ Admin_Per_100k_18Plus <dbl> 120391, 120760, 125835, 104872, 107725, ~
## $ Admin_Per_100k_65Plus <dbl> 175804, 173610, 180600, 154933, 161847, ~
## $ Recip_Administered <dbl> 21237913, 3069562, 5974955, 5114570, 618~
## $ Series_Complete_Yes <dbl> 10086805, 1537214, 2951037, 2439175, 300~
## $ Series_Complete_12Plus <dbl> 10085351, 1537191, 2950892, 2439129, 299~
## $ Series_Complete_18Plus <dbl> 9776152, 1473385, 2825253, 2353696, 2909~
## $ Series_Complete_65Plus <dbl> 3551211, 475114, 889344, 779851, 89281, ~
## $ Series_Complete_Pop_Pct <dbl> 47.0, 48.7, 50.7, 39.7, 39.4, 52.9, 48.0~
## $ Series_Complete_12PlusPop_Pct <dbl> 53.9, 57.4, 59.0, 46.6, 47.0, 61.6, 56.1~
## $ Series_Complete_18PlusPop_Pct <dbl> 56.7, 60.7, 62.0, 49.4, 50.0, 63.7, 58.8~
## $ Series_Complete_65PlusPop_Pct <dbl> 79.0, 85.9, 87.4, 73.4, 74.5, 81.4, 79.0~
Counts by state are created:
vaxState <- vaxProcessed_210712 %>%
group_by(state) %>%
filter(date==max(date)) %>%
select(state, date, Administered, Recip_Administered, Series_Complete_Yes) %>%
ungroup() %>%
arrange(-Administered)
vaxState
## # A tibble: 65 x 5
## state date Administered Recip_Administered Series_Complete_Yes
## <chr> <date> <dbl> <dbl> <dbl>
## 1 US 2021-07-11 334151648 334151648 159266536
## 2 CA 2021-07-11 43609176 43607956 20176353
## 3 TX 2021-07-11 26245668 25536886 12230164
## 4 NY 2021-07-11 22233988 22166452 10763740
## 5 FL 2021-07-11 21527263 21237913 10086805
## 6 PA 2021-07-11 14126934 14159474 6486641
## 7 IL 2021-07-11 13206252 13344907 5971607
## 8 OH 2021-07-11 10835735 10710147 5318622
## 9 NJ 2021-07-11 10029522 10332551 5006341
## 10 MI 2021-07-11 9562802 9766213 4780127
## # ... with 55 more rows
vaxState %>%
filter(!(state %in% c(state.abb, "DC")))
## # A tibble: 14 x 5
## state date Administered Recip_Administered Series_Complete_Yes
## <chr> <date> <dbl> <dbl> <dbl>
## 1 US 2021-07-11 334151648 334151648 159266536
## 2 LTC 2021-07-11 7899665 0 0
## 3 VA2 2021-07-11 5381413 5381413 2706838
## 4 DD2 2021-07-11 4382578 4382578 1888769
## 5 PR 2021-07-11 3832854 3860036 1839207
## 6 IH2 2021-07-11 1459669 1459669 668566
## 7 BP2 2021-07-11 197049 197049 97863
## 8 GU 2021-07-11 194248 194467 93628
## 9 VI 2021-07-11 79692 77067 35899
## 10 MP 2021-07-11 57308 57358 27509
## 11 FM 2021-07-11 51997 52375 26444
## 12 AS 2021-07-11 48178 48436 21997
## 13 MH 2021-07-11 34127 34184 16365
## 14 RP 2021-07-11 25416 25637 13284
vaxState %>%
filter(!(state == "US")) %>%
mutate(pctComplete=Series_Complete_Yes/sum(Series_Complete_Yes)) %>%
mutate(is50DC=state %in% c(state.abb, "DC")) %>%
group_by(is50DC) %>%
summarize(n=n(), across(where(is.numeric), sum), .groups="drop")
## # A tibble: 2 x 6
## is50DC n Administered Recip_Administered Series_Complete_Yes pctComplete
## <lgl> <int> <dbl> <dbl> <dbl> <dbl>
## 1 FALSE 13 23644194 15770269 7436369 0.0455
## 2 TRUE 51 328226539 327261454 156057188 0.955
vaxProcessed_210712 %>%
filter(state=="US") %>%
select(state, date, Administered, Recip_Administered, Series_Complete_Yes) %>%
pivot_longer(-c(state, date)) %>%
ggplot(aes(x=date, y=value/1000000)) +
geom_line(aes(group=name, color=name)) +
labs(x="", y="Number of Doses/People (millions)", title="All-US Vaccination totals")
Roughly 5% of completely vaccinated individuals are tracked to entities that do not map back to states. These will be deleted for further analysis, which may lead to some disconnects.
Next steps are to continue processing the data and to integrate with the other state-level metrics.
Implied populations and vaccinations by subgroup are calculated:
vaxImplied_210712 <- vaxProcessed_210712 %>%
mutate(popTot=100*Series_Complete_Yes/Series_Complete_Pop_Pct,
pop65Plus=100*Series_Complete_65Plus/Series_Complete_65PlusPop_Pct,
pop18Plus=100*Series_Complete_18Plus/Series_Complete_18PlusPop_Pct,
pop12Plus=100*Series_Complete_12Plus/Series_Complete_12PlusPop_Pct,
pop1864=pop18Plus-pop65Plus,
pop1217=pop12Plus-pop18Plus,
pop0011=popTot-pop12Plus,
vax65Plus=Series_Complete_65Plus,
vax1864=Series_Complete_18Plus-Series_Complete_65Plus,
vax1217=Series_Complete_12Plus-Series_Complete_18Plus,
vax0011=Series_Complete_Yes-Series_Complete_12Plus
)
popData <- vaxImplied_210712 %>%
filter(state %in% c(state.abb, "DC", "PR", "US")) %>%
group_by(state) %>%
summarize(across(.cols=c(pop65Plus, pop1864, pop1217, pop0011),
.fns=list(mu=~mean(.x, na.rm=TRUE),
sdmu=~sd(.x, na.rm=TRUE)/mean(.x, na.rm=TRUE),
rangemu=~diff(range(.x, na.rm=TRUE)/mean(.x, na.rm=TRUE))
)
),
.groups="drop"
)
popData %>%
select(state, contains("_rangemu")) %>%
pivot_longer(-state) %>%
ggplot(aes(x=fct_reorder(state, value, .fun=max), y=value)) +
geom_point() +
coord_flip() +
facet_wrap(~name, nrow=1) +
labs(y="Range divided by mean", x=NULL, title="Consistency of population estimates by subgroup and state")
## Warning: Removed 2 rows containing missing values (geom_point).
popData %>%
select(state, contains("_mu")) %>%
pivot_longer(-state) %>%
group_by(state) %>%
mutate(pct65Plus=sum(ifelse(name=="pop65Plus_mu", value, 0))/sum(value)) %>%
ungroup() %>%
ggplot(aes(x=fct_reorder(state, pct65Plus), y=value)) +
geom_col(aes(fill=name), position="fill") +
coord_flip() +
scale_fill_discrete("") +
labs(y="Proportion of population", x=NULL, title="Population breakout by state")
## Warning: Removed 4 rows containing missing values (geom_col).
vaxImplied_210712 %>%
filter(state=="US") %>%
select(state, date, starts_with("vax")) %>%
pivot_longer(-c(state, date)) %>%
ggplot(aes(x=date, y=value)) +
geom_line(aes(group=name, color=name))
vaxImplied_210712 %>%
filter(state=="US") %>%
select(state, date, starts_with("vax")) %>%
pivot_longer(-c(state, date)) %>%
mutate(eq0=(value==0), lt0=(value<0)) %>%
filter(value<=0) %>%
group_by(eq0, lt0, name) %>%
summarize(across(date, .fns=list(min=min, max=max)), .groups="drop")
## # A tibble: 5 x 5
## eq0 lt0 name date_min date_max
## <lgl> <lgl> <chr> <date> <date>
## 1 FALSE TRUE vax1217 2021-03-05 2021-05-12
## 2 TRUE FALSE vax0011 2020-12-13 2021-03-04
## 3 TRUE FALSE vax1217 2020-12-13 2021-03-04
## 4 TRUE FALSE vax1864 2020-12-13 2021-03-04
## 5 TRUE FALSE vax65Plus 2020-12-13 2021-03-04
Population estimates are generally consistent by state across dates, with the greatest variability in the 12-17 age estimates (expected since it is the smallest group where rounded percent vaccinated would have the most impact).
Distributions by age and state appear reasonable.
There has clearly been a change in tracking where fully vaccinated are tracked using age buckets:
Next steps are to modify code so that subtotal statistics by age bucket are used only when where appropriate.
The availability of fields for state ‘US’ (full nation) is explored:
vaxProcessed_210712 %>%
filter(state=="US") %>%
pivot_longer(-c(state, date)) %>%
mutate(valType=case_when(value < 0 ~ "red", value==0 ~ "orange", value > 0 ~ "green")) %>%
ggplot(aes(x=date, y=fct_reorder(name, valType=="green", .fun=sum), fill=valType)) +
geom_tile() +
scale_fill_identity() +
labs(x=NULL, y=NULL, title="Data availability by metric", subtitle="Red is negative, orange is zero")
In the early months, data are available only for administration. The “series complete” metrics are introduced later, with the 12Plus bucket added even later as authorizations for use in ages 12-17 were added.
A comparison of states/DC to US is made for each of the key metrics:
vaxProcessed_210712 %>%
mutate(stateType=case_when(state=="US" ~ "US", state %in% c(state.abb, "DC") ~ "state/DC", TRUE ~ "other")) %>%
group_by(stateType, date, MMWR_week) %>%
summarize(across(where(is.numeric), .fns=sum), .groups="drop") %>%
pivot_longer(-c(stateType, date, MMWR_week)) %>%
filter(!(str_detect(name, "Per|Pct"))) %>%
ggplot(aes(x=date, y=value)) +
geom_line(aes(group=stateType, color=stateType)) +
facet_wrap(~name, scales="free_y")
In general, the sum of the states and DC are close to the total for US. Per capita and percentage metrics cannot be summed and were not compared directly.
Next steps are to adapt the population splits to account for the variable timing of initial data availability A heuristic can likely be used for the split of 65Plus in the early days, with 12Plus and 18Plus assumed to be equal (no usage in 0-17 group) prior to age being broken out.
An assumption is made that Series_Complete_Yes maps to the oldest group still left to populate when data breakouts are incomplete:
vaxImplied_210712_v2 <- vaxProcessed_210712 %>%
mutate(popTot=100*Series_Complete_Yes/Series_Complete_Pop_Pct,
pop65Plus=100*Series_Complete_65Plus/Series_Complete_65PlusPop_Pct,
pop18Plus=100*Series_Complete_18Plus/Series_Complete_18PlusPop_Pct,
pop12Plus=100*Series_Complete_12Plus/Series_Complete_12PlusPop_Pct,
pop1864=pop18Plus-pop65Plus,
pop1217=pop12Plus-pop18Plus,
pop0011=popTot-pop12Plus,
vax65Plus=Series_Complete_65Plus,
vax1864=Series_Complete_18Plus-Series_Complete_65Plus,
vax1217=ifelse(Series_Complete_12Plus>0, Series_Complete_12Plus, Series_Complete_Yes)-Series_Complete_18Plus,
vax0011=Series_Complete_Yes-vax65Plus-vax1864-vax1217
)
popData_v2 <- vaxImplied_210712_v2 %>%
filter(state %in% c(state.abb, "DC", "US")) %>%
group_by(state) %>%
summarize(across(.cols=c(popTot, pop65Plus, pop1864, pop1217, pop0011),
.fns=list(mu=~mean(.x, na.rm=TRUE),
sdmu=~sd(.x, na.rm=TRUE)/mean(.x, na.rm=TRUE),
rangemu=~diff(range(.x, na.rm=TRUE)/mean(.x, na.rm=TRUE))
)
),
.groups="drop"
)
popData_v2 %>%
select(state, contains("_rangemu")) %>%
pivot_longer(-state) %>%
ggplot(aes(x=fct_reorder(state, value, .fun=max), y=value)) +
geom_point() +
coord_flip() +
facet_wrap(~name, nrow=1) +
labs(y="Range divided by mean",
x=NULL,
title="Consistency of population estimates by subgroup and state"
)
popData_v2 %>%
select(state, contains("_mu"), -contains("popTot")) %>%
pivot_longer(-state) %>%
group_by(state) %>%
mutate(pct65Plus=sum(ifelse(name=="pop65Plus_mu", value, 0))/sum(value)) %>%
ungroup() %>%
ggplot(aes(x=fct_reorder(state, pct65Plus), y=value)) +
geom_col(aes(fill=name), position="fill") +
coord_flip() +
scale_fill_discrete("") +
labs(y="Proportion of population", x=NULL, title="Population breakout by state")
vaxImplied_210712_v2 %>%
filter(state=="US") %>%
select(state, date, starts_with("vax")) %>%
pivot_longer(-c(state, date)) %>%
ggplot(aes(x=date, y=value)) +
geom_line(aes(group=name, color=name))
vaxImplied_210712_v2 %>%
filter(state=="US") %>%
select(state, date, starts_with("vax")) %>%
pivot_longer(-c(state, date)) %>%
mutate(eq0=(value==0), lt0=(value<0)) %>%
filter(value<=0) %>%
group_by(eq0, lt0, name) %>%
summarize(across(date, .fns=list(min=min, max=max)), .groups="drop")
## # A tibble: 4 x 5
## eq0 lt0 name date_min date_max
## <lgl> <lgl> <chr> <date> <date>
## 1 TRUE FALSE vax0011 2020-12-13 2021-05-12
## 2 TRUE FALSE vax1217 2020-12-13 2021-03-04
## 3 TRUE FALSE vax1864 2020-12-13 2021-03-04
## 4 TRUE FALSE vax65Plus 2020-12-13 2021-03-04
Data appear reasonable for further use, though with some anomalies still related to the breakouts by age. Metrics per million on a rolling-7 basis are created:
popDataUse <- popData_v2 %>%
filter(state %in% c(state.abb, "DC")) %>%
select(state, contains("_mu")) %>%
pivot_longer(-state) %>%
mutate(ageGroup=stringr::str_replace_all(name, "pop|_mu", "")) %>%
rename(pop=value) %>%
select(state, ageGroup, pop)
vaxDataUse <- vaxImplied_210712_v2 %>%
filter(state %in% c(state.abb, "DC")) %>%
select(state, date, vaxTot=Series_Complete_Yes, starts_with("vax")) %>%
pivot_longer(-c(state, date)) %>%
mutate(ageGroup=stringr::str_replace_all(name, "vax", "")) %>%
rename(vax=value) %>%
select(state, date, ageGroup, vax)
popVaxData <- vaxDataUse %>%
inner_join(popDataUse, by=c("state", "ageGroup")) %>%
mutate(vaxpct=vax/pop) %>%
arrange(state, ageGroup, date) %>%
group_by(state, ageGroup) %>%
helperRollingAgg(origVar="vaxpct", newName="vaxpct7") %>%
ungroup()
popVaxData %>%
filter(!is.na(vaxpct7)) %>%
ggplot(aes(x=date, y=vaxpct7)) +
geom_line(aes(group=state, color=state.region[match(state, state.abb)]), alpha=0.5) +
lims(y=c(0, 1)) +
facet_wrap(~ageGroup) +
labs(title="Percent Fully Vaccinated", x=NULL, y="Rolling 7 'Series Complete' percentage") +
scale_color_discrete("Census\nRegion")
popVaxData %>%
filter(!is.na(vaxpct7)) %>%
ggplot(aes(x=date, y=vaxpct7)) +
geom_line(aes(group=ageGroup, color=ageGroup)) +
lims(y=c(0, 1)) +
facet_wrap(~state) +
labs(title="Percent Fully Vaccinated", x=NULL, y="Rolling 7 'Series Complete' percentage") +
scale_color_discrete("Age")
Next steps are to incorporate these steps as a reproducible function.
The function readQCRawCDCDaily() is copied and applied:
# Function to read and check a raw data file
readQCRawCDCDaily <- function(fileName,
writeLog=NULL,
ovrwriteLog=TRUE,
dfRef=NULL,
urlType=NULL,
url=NULL,
getData=TRUE,
ovrWriteDownload=FALSE,
vecRename=NULL,
selfList=NULL,
fullList=NULL,
uniqueBy=NULL,
step3Group=NULL,
step3Vals=NULL,
step4KeyVars=NULL,
step5PlotItems=NULL,
step6AggregateList=NULL,
inferVars=list("url"=urlMapper,
"vecRename"=renMapper,
"selfList"=selfListMapper,
"fullList"=fullListMapper,
"uniqueBy"=uqMapper,
"step3Group"=checkControlGroupMapper,
"step3Vals"=checkControlVarsMapper,
"step4KeyVars"=checkSimilarityMapper,
"step5PlotItems"=plotSimilarityMapper,
"step6AggregateList"=keyAggMapper
)
) {
# FUNCTION ARGUMENTS
# fileName: the location where downloaded data either is, or will be, stored
# writeLog: the external file location for printing (NULL means use the main log stdout)
# ovrwriteLog: boolean, if using an external log, should it be started from scratch (overwritten)?
# dfRef: a reference data frame for comparison (either NULL or NA means do not run comparisons)
# urlType: character vector that can be mapped using urlMapper and keyVarMapper
# url: direct URL passed as character string
# NOTE that if both url and urlType are NULL, no file will be downloaded
# getData: boolean, should an attempt be made to get new data using urlType or url?
# ovrWriteDownload: boolean, if fileName already exists, should it be overwritten?
# vecRename: vector for renaming c('existing name'='new name'), can be any length from 0 to ncol(df)
# NULL means infer from urlType, if not available there use c()
# selfList: list for functions to apply to self, list('variable'=fn) will apply variable=fn(variable)
# processed in order, so more than one function can be applied to self
# NULL means infer from urlType, if not available in mapping file use list()
# fullList: list for general functions to be applied, list('new variable'=expression(code))
# will create 'new variable' as eval(expression(code))
# for now, requires passing an expression
# NULL means infer from urlType, use list() if not in mapping file
# uniqueBy: combination of variables for checking uniqueness
# NULL means infer from data, keep as NULL (meaning use-all) if cannot be inferred
# step3Group: variable to be used as the x-axis (grouping) for step 3 plots
# NULL means infer from data
# step3Vals: values to be plotted on the y-axis for step 3 plots
# NULL means infer from data
# step4KeyVars: list of parameters to be passed as keyVars= in step 4
# NULL means infer from urlType
# step5PlotItems: items to be plotted in step 5
# NULL means infer from urlType
# step6AggregateList: drives the elements to be passed to compareAggregate() and flagLargeDelta()
# NULL means infer from urlType
# inferVars: vector of c('variable'='mapper') for inferring parameter values when passed as NULL
# Step 0a: Use urlType to infer key variables if passed as NULL
for (vrbl in names(inferVars)) {
mapper <- inferVars[[vrbl]]
if (is.null(get(vrbl))) {
if (urlType %in% names(mapper)) assign(vrbl, mapper[[urlType]])
else if ("default" %in% names(mapper)) assign(vrbl, mapper[["default"]])
}
}
# Step 1: Download a new file (if requested)
if (!is.null(url) & isTRUE(getData)) fileDownload(fileName=fileName, url=url, ovrWrite=ovrWriteDownload)
else cat("\nNo file has been downloaded, will use existing file:", fileName, "\n")
# Step 2: Read file, rename and mutate variables, confirm uniqueness by expected levels
dfRaw <- fileRead(fileName) %>%
colRenamer(vecRename) %>%
colMutater(selfList=selfList, fullList=fullList) %>%
checkUniqueRows(uniqueBy=uniqueBy)
# Step 3: Plot basic control totals for new cases and new deaths by month
dfRaw %>%
checkControl(groupBy=step3Group, useVars=step3Vals, printControls=FALSE, na.rm=TRUE) %>%
helperLinePlot(x=step3Group, y="newValue", facetVar="name", facetScales="free_y", groupColor="name")
# If there is no file for comparison, return the data
if (is.null(dfRef) | if(length(dfRef)==1) is.na(dfRef) else FALSE) return(dfRaw)
# Step 4b: Check similarity of existing and reference file
# ovrWriteLog=FALSE since everything should be an append after the opening text line in step 0
diffRaw <- checkSimilarity(df=dfRaw,
ref=dfRef,
keyVars=step4KeyVars,
writeLog=writeLog,
ovrwriteLog=FALSE
)
# Step 5: Plot the similarity checks
plotSimilarity(diffRaw, plotItems=step5PlotItems)
# Step 6: Plot and report on differences in aggregates
helperAggMap <- function(x) {
h1 <- compareAggregate(df=dfRaw, ref=dfRef, grpVar=x$grpVar, numVars=x$numVars,
sameUniverse=x$sameUniverse, plotData=x$plotData, isLine=x$isLine,
returnDelta=x$returnDelta)
if (isTRUE(x$flagLargeDelta)) {
h2 <- flagLargeDelta(h1, pctTol=x$pctTol, absTol=x$absTol, sortBy=x$sortBy,
dropNA=x$dropNA, printAll=x$printAll
)
if (is.null(writeLog)) print(h2)
else {
cat(nrow(h2), " records", sep="")
txt <- paste0("\n\n***Differences of at least ",
x$absTol,
" and at least ",
round(100*x$pctTol, 3), "%\n\n"
)
printLog(h2, txt=txt, writeLog=writeLog)
}
}
}
lapply(step6AggregateList, FUN=helperAggMap)
cat("\n\n")
# Return the raw data file
dfRaw
}
# Run without downloading data and without a comparison file
vaxRaw_210712_func <- readQCRawCDCDaily(fileName="./RInputFiles/Coronavirus/CDC_vax_downloaded_210712.csv",
url="https://data.cdc.gov/api/views/unsk-b7fc/rows.csv?accessType=DOWNLOAD",
getData=FALSE,
vecRename=c("Location"="state",
"Date"="date",
"Admin_Per_100K"="Admin_Per_100k"
),
selfList=list("date"=lubridate::mdy),
uniqueBy=c("date", "state"),
step3Group=c("date"),
step3Vals=c("Administered",
"Series_Complete_Yes",
"Series_Complete_12Plus",
"Series_Complete_18Plus",
"Series_Complete_65Plus"
),
inferVars=list()
)
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/CDC_vax_downloaded_210712.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## Date = col_character(),
## Location = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: date state
While there is double-counting due to the “US” record being included, the general process for a basic file read is working as intended. Next steps are to update the process to allow for comparison to an existing file.
The latest vaccines data are downloaded, with results cached:
urlVaccine <- "https://data.cdc.gov/api/views/unsk-b7fc/rows.csv?accessType=DOWNLOAD"
locVaccine <- "./RInputFiles/Coronavirus/CDC_vax_downloaded_210717.csv"
fileDownload(locVaccine, urlVaccine)
## size isdir mode
## ./RInputFiles/Coronavirus/CDC_vax_downloaded_210717.csv 4406078 FALSE 666
## mtime
## ./RInputFiles/Coronavirus/CDC_vax_downloaded_210717.csv 2021-07-17 08:17:29
## ctime
## ./RInputFiles/Coronavirus/CDC_vax_downloaded_210717.csv 2021-07-17 08:17:26
## atime exe
## ./RInputFiles/Coronavirus/CDC_vax_downloaded_210717.csv 2021-07-17 08:17:29 no
The function readQCRawCDCDaily() is applied using the previous data as the control:
# Run without downloading data and with a comparison file
vaxRaw_210717_func <- readQCRawCDCDaily(fileName="./RInputFiles/Coronavirus/CDC_vax_downloaded_210717.csv",
dfRef=vaxRaw_210712_func,
url="https://data.cdc.gov/api/views/unsk-b7fc/rows.csv?accessType=DOWNLOAD",
getData=FALSE,
vecRename=c("Location"="state",
"Date"="date",
"Admin_Per_100K"="Admin_Per_100k"
),
selfList=list("date"=lubridate::mdy),
uniqueBy=c("date", "state"),
step3Group=c("date"),
step3Vals=c("Administered",
"Series_Complete_Yes",
"Series_Complete_12Plus",
"Series_Complete_18Plus",
"Series_Complete_65Plus"
),
step4KeyVars=list(date=list(label='date', countOnly=TRUE, convChar=TRUE),
state=list(label='state', countOnly=FALSE)
),
step5PlotItems=c("date"),
step6AggregateList=list("l1"=list("grpVar"="date",
"numVars"=c("Administered",
"Series_Complete_Yes",
"Series_Complete_12Plus",
"Series_Complete_18Plus",
"Series_Complete_65Plus"
),
"sameUniverse"=NA,
"plotData"=TRUE,
"isLine"=TRUE,
"returnDelta"=TRUE,
"flagLargeDelta"=TRUE,
"pctTol"=0.01,
"absTol"=1,
"sortBy"=c("name", "pctDelta", "absDelta"),
"dropNA"=TRUE,
"printAll"=TRUE
),
"l3"=list("grpVar"="state",
"numVars"=c("Administered",
"Series_Complete_Yes",
"Series_Complete_12Plus",
"Series_Complete_18Plus",
"Series_Complete_65Plus"
),
"sameUniverse"="date",
"plotData"=TRUE,
"isLine"=FALSE,
"returnDelta"=TRUE,
"flagLargeDelta"=TRUE,
"pctTol"=0.001,
"absTol"=0,
"sortBy"=c("name", "pctDelta", "absDelta"),
"dropNA"=TRUE,
"printAll"=TRUE
)
),
inferVars=list()
)
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/CDC_vax_downloaded_210717.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## Date = col_character(),
## Location = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: date state
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 5
##
## Checking for similarity of: state
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 1 and at least 1%
##
## [1] date name newValue refValue absDelta pctDelta
## <0 rows> (or 0-length row.names)
##
##
## ***Differences of at least 0 and at least 0.1%
##
## [1] state name newValue refValue absDelta pctDelta
## <0 rows> (or 0-length row.names)
The function works well for reading a raw vaccines data file, running basic checks, and comparing to a previous vaccines data file. Next steps are to adapt the function for processing a vaccines data file.
The function processRawFile() is leveraged:
# Generic function for processing a raw file
processRawFile <- function(df,
vecRename=c(),
vecSelect=NULL,
lstCombo=list(),
lstFilter=list(),
lstExclude=list()
) {
# FUNCTION ARGUMENTS:
# df: the raw data frame or tibble
# vecRename: vector for renaming c('existing name'='new name'), can be any length from 0 to ncol(df)
# vecSelect: vector of columns to select (run after vecRename), NULL means select all columns
# lstCombo: a nested list of combinations to be applied
# each element of the list should include comboVar, uqVars, vecCombo, and fn
# lstFilter: a list for filtering records, of form list("field"=c("allowed values"))
# lstExclude: a list for filtering records, of form list("field"=c("disallowed values"))
# STEP 1: Rename and select variables (selection occurs AFTER renaming)
dfProcess <- df %>%
colRenamer(vecRename=vecRename) %>%
colSelector(vecSelect=vecSelect)
# STEP 2: Combine multiple records to a single record
for (ctr in seq_along(lstCombo)) {
dfProcess <- dfProcess %>%
combineRows(comboVar=lstCombo[[ctr]]$comboVar,
uqVars=lstCombo[[ctr]]$uqVars,
vecCombo=lstCombo[[ctr]]$vecCombo,
fn=lstCombo[[ctr]]$fn
)
}
# STEP 3: Filter records
qcOrig <- dfProcess %>%
summarize(across(where(is.numeric), sum, na.rm=TRUE), n=n()) %>%
mutate(isType="before")
dfProcess <- dfProcess %>%
rowFilter(lstFilter=lstFilter, lstExclude=lstExclude)
# STEP 4: Report on differences
cat("\nColumn sums before and after applying filtering rules:\n")
dfProcess %>%
summarize(across(where(is.numeric), sum, na.rm=TRUE), n=n()) %>%
mutate(isType="after") %>%
bind_rows(qcOrig) %>%
arrange(desc(isType)) %>%
bind_rows(mutate(summarize(., across(where(is.numeric), function(x) (max(x)-min(x))/max(x))),
isType="pctchg"
)
) %>%
select(isType, everything()) %>%
print()
cat("\n")
# Return the processed data file
dfProcess
}
vaxProc_210717_func <- processRawFile(vaxRaw_210717_func,
vecRename=c(),
vecSelect=c("date", "state", "MMWR_week",
"Administered", "Admin_Per_100k",
"Series_Complete_Yes", "Series_Complete_Pop_Pct",
"Series_Complete_12Plus", "Series_Complete_12PlusPop_Pct",
"Series_Complete_18Plus", "Series_Complete_18PlusPop_Pct",
"Series_Complete_65Plus", "Series_Complete_65PlusPop_Pct"
),
lstCombo=list(),
lstFilter=list("state"=c(state.abb, "DC")),
lstExclude=list()
)
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 13
## isType MMWR_week Administered Admin_Per_100k Series_Complete~ Series_Complete~
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 before 2.49e+5 7.14e+10 628437370 2.87e+10 258606.
## 2 after 1.97e+5 3.39e+10 531817808 1.39e+10 218997.
## 3 pctchg 2.10e-1 5.25e- 1 0.154 5.16e- 1 0.153
## # ... with 7 more variables: Series_Complete_12Plus <dbl>,
## # Series_Complete_12PlusPop_Pct <dbl>, Series_Complete_18Plus <dbl>,
## # Series_Complete_18PlusPop_Pct <dbl>, Series_Complete_65Plus <dbl>,
## # Series_Complete_65PlusPop_Pct <dbl>, n <dbl>
vaxProc_210717_func
## # A tibble: 10,965 x 13
## date state MMWR_week Administered Admin_Per_100k Series_Complete_Yes
## <date> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 2021-07-16 FL 28 21688774 100983 10167736
## 2 2021-07-16 KS 28 2540782 87213 1249272
## 3 2021-07-16 SC 28 4292812 83376 2045648
## 4 2021-07-16 AR 28 2310080 76548 1062254
## 5 2021-07-16 ND 28 652273 85593 301349
## 6 2021-07-16 MN 28 5979756 106031 2987234
## 7 2021-07-16 DE 28 1065570 109428 501985
## 8 2021-07-16 IA 28 3087945 97872 1543626
## 9 2021-07-16 NV 28 2867707 93103 1330894
## 10 2021-07-16 DC 28 885481 125467 379400
## # ... with 10,955 more rows, and 7 more variables:
## # Series_Complete_Pop_Pct <dbl>, Series_Complete_12Plus <dbl>,
## # Series_Complete_12PlusPop_Pct <dbl>, Series_Complete_18Plus <dbl>,
## # Series_Complete_18PlusPop_Pct <dbl>, Series_Complete_65Plus <dbl>,
## # Series_Complete_65PlusPop_Pct <dbl>
Next steps are to run the per-capita process for conversion of Administered and Series_Complete_Yes based on the same state population data used for cases, deaths, and hospitalizations.
The function createPerCapita() is leveraged:
# Function to extract and format key state data
getStateData <- function(df=readFromRDS("statePop2019"),
renameVars=c("stateAbb"="state", "NAME"="name", "pop_2019"="pop"),
keepVars=c("state", "name", "pop")
) {
# FUNCTION ARGUMENTS:
# df: the data frame containing state data
# renameVars: variables to be renamed, using named list with format "originalName"="newName"
# keepVars: variables to be kept in the final file
# Rename variables where appropriate
names(df) <- ifelse(is.na(renameVars[names(df)]), names(df), renameVars[names(df)])
# Return file with only key variables kept
df %>%
select_at(vars(all_of(keepVars)))
}
useVars <- c("state", "date", "Administered", "Series_Complete_Yes")
vaxPerCap_210717_func <- createPerCapita(select(vaxProc_210717_func, all_of(useVars)),
uqBy=c("state", "date"),
popData=getStateData(),
mapper=c("Administered"="vxapm", "Series_Complete_Yes"="vxcpm"),
)
vaxPerCap_210717_func
## # A tibble: 10,965 x 8
## state date Administered Series_Complete_Yes vxapm vxcpm vxapm7 vxcpm7
## <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AK 2020-12-14 0 0 0 0 NA NA
## 2 AL 2020-12-14 0 0 0 0 NA NA
## 3 AR 2020-12-14 0 0 0 0 NA NA
## 4 AZ 2020-12-14 0 0 0 0 NA NA
## 5 CA 2020-12-14 0 0 0 0 NA NA
## 6 CO 2020-12-14 0 0 0 0 NA NA
## 7 CT 2020-12-14 0 0 0 0 NA NA
## 8 DC 2020-12-14 0 0 0 0 NA NA
## 9 DE 2020-12-14 0 0 0 0 NA NA
## 10 FL 2020-12-14 0 0 0 0 NA NA
## # ... with 10,955 more rows
vaxPerCap_210717_func %>%
select(state, date, vxapm7, vxcpm7) %>%
pivot_longer(-c(state, date)) %>%
filter(!is.na(value), name=="vxcpm7") %>%
mutate(region=ifelse(state=="DC", "South Atlantic", as.character(state.division)[match(state, state.abb)])) %>%
ggplot(aes(x=date, y=value/1000000)) +
geom_line(aes(group=state), alpha=0.25) +
geom_line(data=~summarize(group_by(., region, date), value=median(value), .groups="drop"),
aes(color=region)
) +
facet_wrap(~region) +
lims(y=c(0, 1)) +
labs(x=NULL,
y="Proportion Fully Vaccinated (of total population)",
title="Evolution of fully vaccinated by state and census division",
subtitle="Colored line is median in region, gray line is individual states in region"
) +
theme(legend.position="none")
The createPerCapita() function is updated to allow for keeping variables without calculating per-million or rolling-7 aggregates:
# Generic function to create per-capita metrics using an existing file and source of population data
createPerCapita <- function(lst,
uqBy,
popData,
mapper,
asIsVars=c(),
lstSortBy=uqBy,
fnJoin=dplyr::full_join,
popJoinBy="state",
popVar="pop",
k=7,
mult=1000000,
...
) {
# FUNCTION ARGUMENTS:
# lst: A list containing one or more files to be joined OR a data frame that is already joined
# uqBy: character string that the input file is unique by (will be the join keys if a list is passed)
# popData: file containing population data that can be joined to the processed lst
# mapper: mapping file of c('current name'='per capita name') for mapping variables
# asIsVars: variables to be kept, but without creating pm or pm7
# lstSortBy: the sorting that should be used for creating rolling metrics
# fnJoin: The function to be used for joining files
# popJoinBy: character string for the variable(s) to be used in joining popData to lst
# popVar: character string for the variable in popData that represents population
# k: time perior for rolling aggregations
# mult: the unit for the per-capita data (default 1 million means make metrics per million)
# ...: other arguments to be passed to combineFiles()
# Step 1: If a list has been passed, use a joining process to create a data frame
if ("list" %in% class(lst)) lst <- combineFiles(lst, byVars=uqBy, fn=fnJoin, ...)
# Step 2: Sort the data using sortBy
df <- dplyr::arrange(lst, across(all_of(lstSortBy)))
# Step 3: Check that all variables other than uqBy and asIsVars can be mapped using mapper
keyVars <- setdiff(names(df), c(uqBy, asIsVars))
if (any(isFALSE(keyVars %in% mapper))) stop("\nVariable is missing in per capita mapper file\n")
# Step 4: Run the per capita mapping process
df <- helperMakePerCapita(df,
mapVars=mapper[keyVars],
popData=popData,
k=k,
byVar=popJoinBy,
sortVar=setdiff(lstSortBy, popJoinBy),
popVar=popVar,
mult=mult
)
# Return the data frame
df
}
The updated process is then run, keeping the breakout for 65+ and 18+:
uqVars <- c("state", "date")
perCapVars <- c("Administered", "Series_Complete_Yes")
asIsVars <- c("Series_Complete_65Plus", "Series_Complete_65PlusPop_Pct",
"Series_Complete_18Plus", "Series_Complete_18PlusPop_Pct",
"Admin_Per_100k", "Series_Complete_Pop_Pct"
)
vaxPerCap_210717_func_v2 <- createPerCapita(select(vaxProc_210717_func, all_of(c(uqVars, perCapVars, asIsVars))),
uqBy=uqVars,
asIsVars=asIsVars,
popData=getStateData(),
mapper=c("Administered"="vxapm", "Series_Complete_Yes"="vxcpm")
) %>%
colRenamer(c("Series_Complete_Yes"="vxc",
"Administered"="vxa",
"Series_Complete_Pop_Pct"="vxcpoppct",
"Series_Complete_65Plus"="vxcgte65",
"Series_Complete_65PlusPop_Pct"="vxcgte65pct",
"Series_Complete_18Plus"="vxcgte18",
"Series_Complete_18PlusPop_Pct"="vxcgte18pct"
)
)
vaxPerCap_210717_func_v2
## # A tibble: 10,965 x 14
## state date vxa vxc vxcgte65 vxcgte65pct vxcgte18 vxcgte18pct
## <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AK 2020-12-14 0 0 0 0 0 0
## 2 AL 2020-12-14 0 0 0 0 0 0
## 3 AR 2020-12-14 0 0 0 0 0 0
## 4 AZ 2020-12-14 0 0 0 0 0 0
## 5 CA 2020-12-14 0 0 0 0 0 0
## 6 CO 2020-12-14 0 0 0 0 0 0
## 7 CT 2020-12-14 0 0 0 0 0 0
## 8 DC 2020-12-14 0 0 0 0 0 0
## 9 DE 2020-12-14 0 0 0 0 0 0
## 10 FL 2020-12-14 0 0 0 0 0 0
## # ... with 10,955 more rows, and 6 more variables: Admin_Per_100k <dbl>,
## # vxcpoppct <dbl>, vxapm <dbl>, vxcpm <dbl>, vxapm7 <dbl>, vxcpm7 <dbl>
# Check consistency of 'Admin_Per_100k' and 'vxapm'
vaxPerCap_210717_func_v2 %>%
filter(date==max(date)) %>%
ggplot(aes(x=Admin_Per_100k, y=vxapm)) +
geom_text(aes(label=state)) +
geom_abline(slope=10, intercept=0, lty=2) +
labs(x="Raw data administered per 100k",
y="Function-calculated adminsitered per million",
title="Consistency of raw data and function-calculated per capita data",
subtitle="Dotted line is per-million at 10x per-100k (expected)"
)
# Check consistency of 'vxcpoppct' and 'vxcpm'
vaxPerCap_210717_func_v2 %>%
filter(date==max(date)) %>%
ggplot(aes(x=vxcpoppct, y=vxcpm)) +
geom_text(aes(label=state)) +
geom_abline(slope=10000, intercept=0, lty=2) +
labs(x="Raw data percent of population completely vaccinated",
y="Function-calculated completely vaccinated per million",
title="Consistency of raw data and function-calculated per capita data",
subtitle="Dotted line is per-million at 10,000x per-100 (expected)"
)
The raw data and per-capita totals are aligned, suggesting that population estimates used in the datasets are very similar (functions use 2019 estimates as per getStateData()).
The colMutater() function is added to include:
Variables are added as follows:
# Conversions for 18-64 and 0-17
subGroupList <- list("vxc1864"=expression(vxcgte18-vxcgte65),
"vxc0017"=expression(vxc-vxcgte18)
)
# Conversions for per-day
perDayFunc <- function(x) ifelse(row_number()==1, x, ifelse(lag(x)==0, 0, x-lag(x)))
perDayList <- list("vxa_perday"=expression(perDayFunc(vxa)),
"vxc_perday"=expression(perDayFunc(vxc)),
"vxcgte65_perday"=expression(perDayFunc(vxcgte65)),
"vxc1864_perday"=expression(perDayFunc(vxc1864)),
"vxc0017_perday"=expression(perDayFunc(vxc0017))
)
vaxPerCap_210717_func_v3 <- vaxPerCap_210717_func_v2 %>%
colMutater(fullList=subGroupList) %>%
arrange(date, state) %>%
group_by(state) %>%
colMutater(fullList=perDayList) %>%
ungroup()
# Check that files are identical for same variables
sapply(names(vaxPerCap_210717_func_v2),
FUN=function(x) all.equal(vaxPerCap_210717_func_v2[[x]], vaxPerCap_210717_func_v3[[x]])
) %>%
t() %>%
t()
## [,1]
## state TRUE
## date TRUE
## vxa TRUE
## vxc TRUE
## vxcgte65 TRUE
## vxcgte65pct TRUE
## vxcgte18 TRUE
## vxcgte18pct TRUE
## Admin_Per_100k TRUE
## vxcpoppct TRUE
## vxapm TRUE
## vxcpm TRUE
## vxapm7 TRUE
## vxcpm7 TRUE
# Plot evolution of vaccines by age
vaxPerCap_210717_func_v3 %>%
select(date, vxc, vxcgte65, vxc1864, vxc0017) %>%
group_by(date) %>%
summarize(across(.fns=sum)) %>%
pivot_longer(-date) %>%
ggplot(aes(x=date)) +
geom_point(data=~filter(., name=="vxc"), aes(y=value/1000000)) +
geom_col(data=~filter(., name!="vxc"), aes(y=value/1000000, fill=name), position="stack") +
labs(x=NULL,
y="Completely Vaccinated (millions)",
title="Evolution of fully vaccinated by age group",
subtitle="Dots are total people fully vaccinated"
) +
scale_fill_discrete("Age")
# Plot evolution of vaccines administered per day
vaxPerCap_210717_func_v3 %>%
select(date, vxa, vxa_perday) %>%
group_by(date) %>%
summarize(across(where(is.numeric), sum)) %>%
pivot_longer(-date) %>%
group_by(name) %>%
mutate(value7=zoo::rollmean(value, k=7, fill=NA)) %>%
ungroup() %>%
ggplot(aes(x=date)) +
geom_line(aes(y=value/1000000)) +
geom_line(data=~filter(., name=="vxa_perday", !is.na(value7)), aes(y=value7/1000000), color="red", lwd=2) +
facet_wrap(~c("vxa"="Cumulative", "vxa_perday"="Daily")[name], scales="free_y") +
labs(x=NULL,
y="Vaccines Adminsitered (millions)",
title="Evolution of vaccines administered",
subtitle="Red line is rolling 7-day average"
)
Variables appear to be created as intended.
Next, total population is estimated and plots of vaccines administered per capita are created:
# Plot evolution of vaccines administered per day
vaxPerCap_210717_func_v3 %>%
select(state, date, vxa, vxa_perday, Admin_Per_100k) %>%
group_by(state) %>%
mutate(pop=median(100000*vxa/Admin_Per_100k, na.rm=TRUE)) %>%
ungroup() %>%
pivot_longer(-c(state, date, pop)) %>%
group_by(state, pop, name) %>%
mutate(value7=zoo::rollmean(value, k=7, fill=NA)) %>%
ungroup() %>%
ggplot(aes(x=date)) +
geom_line(data=~filter(., name=="vxa_perday", !is.na(value7), state != "NM"),
aes(y=1000*value7/pop, group=state)
) +
facet_wrap(~state) +
labs(x=NULL,
y="Vaccines Adminsitered (per thousand)",
title="Evolution of vaccines administered (rolling 7-day average)"
)
# Plot evolution of vaccines administered (cumulative)
vaxPerCap_210717_func_v3 %>%
select(state, date, vxa, vxa_perday, Admin_Per_100k) %>%
group_by(state) %>%
mutate(pop=median(100000*vxa/Admin_Per_100k, na.rm=TRUE)) %>%
ungroup() %>%
pivot_longer(-c(state, date, pop)) %>%
group_by(state, pop, name) %>%
mutate(value7=zoo::rollmean(value, k=7, fill=NA)) %>%
ungroup() %>%
ggplot(aes(x=date)) +
geom_line(data=~filter(., name=="vxa", !is.na(value)),
aes(y=1000*value/pop, group=state)
) +
geom_hline(yintercept=1000, lty=2) +
facet_wrap(~state) +
labs(x=NULL,
y="Vaccines Adminsitered (per thousand)",
title="Evolution of vaccines administered (cumulative)"
)
Estimates are made for population 65+, 18-64, and 0-17 based on completion percentages:
popEstAgeState <- vaxPerCap_210717_func_v3 %>%
select(state, date, vxcgte65, vxcgte65pct, vxcgte18, vxcgte18pct, vxc, vxcpoppct) %>%
mutate(popgte65=100*vxcgte65/vxcgte65pct, popgte18=100*vxcgte18/vxcgte18pct, pop=100*vxc/vxcpoppct) %>%
group_by(state) %>%
summarize(across(c(popgte65, popgte18, pop), median, na.rm=TRUE), .groups="drop") %>%
mutate(pop1864=popgte18-popgte65, pop0017=pop-popgte18)
popEstAgeState
## # A tibble: 51 x 6
## state popgte65 popgte18 pop pop1864 pop0017
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AK 91583. 551552. 731518. 459969. 179966.
## 2 AL 849784. 3815294. 4903279. 2965510. 1087985.
## 3 AR 523915. 2317203. 3016324. 1793288. 699121.
## 4 AZ 1308603. 5637959. 7278853. 4329356. 1640894.
## 5 CA 5838155. 30623548. 39513431. 24785393. 8889884.
## 6 CO 842448. 4499815. 5758852. 3657367. 1259037.
## 7 CT 630249. 2837921. 3565957. 2207671. 728037.
## 8 DC 87347. 577646. 705734. 490299. 128088.
## 9 DE 188903. 770105. 973906. 581202. 203801.
## 10 FL 4497505. 17248010. 21479412. 12750505. 4231402.
## # ... with 41 more rows
popEstAgeState %>%
summarize(across(where(is.numeric), sum))
## # A tibble: 1 x 5
## popgte65 popgte18 pop pop1864 pop0017
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 54058078. 255206800. 328239240. 201148722. 73032440.
popEstAgeState %>%
select(-popgte18, -pop) %>%
pivot_longer(-state) %>%
group_by(state) %>%
mutate(pctg65=ifelse(name=="popgte65", value, 0)/sum(value)) %>%
ggplot(aes(x=fct_reorder(state, pctg65, .fun=max), y=value)) +
geom_col(aes(fill=name), position="fill") +
coord_flip() +
labs(x=NULL,
y="Proportion",
title="Distribution of population by state",
subtitle="Estimated from reported vaccine completion percentages by sub-group"
) +
scale_fill_discrete("Age Group")